# -*- coding: utf-8 -*-
import scrapy
from ..settings import DEFAULT_REQUEST_HEADERS
from ..items import GovernmentprojectItem
class GovernmentSpider(scrapy.Spider):
    """Spider that collects article links from a sousuo.gov.cn column listing.

    The crawl starts at page ``0.htm`` of column 31421 and additionally
    requests pages ``1.htm`` and ``2.htm``. For every listing page, one
    ``GovernmentprojectItem`` is emitted per ``<h4><a href=...>`` link found.
    """

    name = 'government'
    allowed_domains = ['sousuo.gov.cn']
    start_urls = ['http://sousuo.gov.cn/column/31421/0.htm']

    def start_requests(self):
        """Yield the initial request for the first listing page.

        The request carries the project's ``DEFAULT_REQUEST_HEADERS`` and is
        sent with ``dont_filter=True`` so it bypasses the duplicate filter.
        """
        url = self.start_urls[0]
        yield scrapy.Request(
            url,
            callback=self.parse_info,
            headers=DEFAULT_REQUEST_HEADERS,
            dont_filter=True,
        )

    def parse_info(self, response):
        """Extract article links from a listing page and schedule more pages.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``GovernmentprojectItem`` per link, with ``link_url`` set to
            the raw ``href`` value, followed by ``scrapy.Request`` objects for
            listing pages 1 and 2.
        """
        # News article hyperlinks.
        link_urls = response.xpath('//h4/a/@href').extract()
        for link_url in link_urls:
            # Build a fresh item for every link: re-yielding one shared item
            # and mutating it would let later links overwrite earlier ones
            # for any consumer that still holds a reference to the item.
            item = GovernmentprojectItem()
            item['link_url'] = link_url
            yield item

        # These requests are yielded from every parsed page; they are sent
        # without dont_filter, so Scrapy's duplicate filter is relied upon to
        # drop the repeats and keep the crawl from looping.
        url1 = 'http://sousuo.gov.cn/column/31421/'
        for i in range(1, 3):
            url = url1 + '%d.htm' % (i)
            yield scrapy.Request(url, callback=self.parse_info)
